Initial Setup¶
# Initial setup
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image
import os
# Configure plotting
plt.rcParams.update({
'figure.figsize': [12, 8],
'figure.dpi': 150,
'figure.autolayout': True,
'axes.labelsize': 12,
'axes.titlesize': 14,
'font.size': 12
})
pathRaw = "./data_raw/"
pathFilter = "./data_filtered/"
pathProcessd = "./data_processed/"
pathVisuRaw = "./visu_raw/"
pathVisuProcessed = "./visu_processed/"
pathOnlyProcessed = "./visu_only_processed/"
pathProb = "./visu_prob/"
files = [f"system-{number}.csv" for number in range(1, 20)]
# Systems 3, 5, 6, 8, 11 and 17 do not have sys-thermal readings ! 3/5/6 -> crashes 8/11/17 -> no thermal
remove_entries = [7,10,16]
files = [item for index, item in enumerate(files) if index not in remove_entries]
# --------------------- chosen for analysis ----------------------------------------------------------------------
# for present-ation and analysis: system-19
present = files[-1]
sns.set_style("whitegrid")
sns.set_context("notebook", font_scale=1.2)
np.random.seed(42)
2.1 Data Preprocessing and Basic Analysis¶
- Basic statistical analysis using pandas
-> see load_system_data()
- Original data quality analysis (including visualization)
-> see Analysis Notes after visu_raw_data()
- Data preprocessing
-> see preprocess_system_data() and "data_processed"
- Preprocessed vs original data visual analysis
-> see Analysis Notes after visu_processed_data()
2.2 Visualization and Exploratory Analysis¶
- Time series visualizations
- Distribution analysis with histograms
- Correlation analysis and heatmaps
- Daily pattern analysis
-> see visu_processed_data() and Analysis Notes after visu_processed_data()
- Summary of observed patterns - similar to True/False questions
-> see Analysis Notes after visu_processed_data()
All figures/plots can be accessed in "visu_raw", "visu_processed" and "visu_only_processed".
.
Loading and Filtering¶
Files are fetched from directory and prefiltering for columns of interst.
Processing timestamps to datetime for usage in timeseries (and usability).
Done via a function to execute for every file separately and be able to pipe if necessary.
Returning the dataframe could be either dropped or caught by either a container or piped into the next function.
- 2.3: Basic statistical analysis using pandas
-> output into CSV (visu_raw)
def load_system_data(file_dir: str, file_name: str) -> pd.DataFrame :
"""Load and prepare test system performance data.
Parameters
----------
file_dir : str
Path to the CSV data file location (directory)
file_name : str
Name of the specified CSV file
Additional outputs
saves filtered data into dir "./data_filtered"
Returns
-------
pd.DataFrame
Raw dataframe with columns:
- datetime (index)
- load-15m
- memory_used_pct
- cpu-user
- cpu-system
- sys-thermal
- sys-interrupt-rate
- server-up
- disk-io-time
"""
file_path = file_dir + file_name
df = pd.read_csv(file_path, delimiter = ",",usecols=["timestamp",
"load-15m",
"sys-mem-available",
"sys-mem-total",
"cpu-user",
"cpu-system",
"sys-thermal",
"sys-interrupt-rate",
"server-up",
"disk-io-time"]) # Read in data with columns
df['datetime'] = pd.to_datetime(df['timestamp'], unit = 's', errors = 'coerce') # Create datetime from timestamp
df.set_index('datetime', inplace=True) # Set datetime as index
df["load-15m"] = df["load-15m"] * 100
df['memory_used_pct'] = (1 - df['sys-mem-available']/df['sys-mem-total']) * 100 # Memory usage calculation
df.drop(["timestamp","sys-mem-available","sys-mem-total"], axis=1, inplace=True) # Drop unneccessary data
df.to_csv(pathFilter+file_name, index=True) # output filtered .csv to path (dir)
file_name = file_name.replace(".csv","")
df.describe().to_csv(f'{pathVisuRaw}{file_name}_desciption.csv') # output pandas desscribe() result into .csv to path (dir)
return df
# testing
df = load_system_data(pathRaw,present)
# Pre filter all files
# for file in files:
# load_system_data(pathRaw, file)
Visualizing Raw¶
- 2.1: Original data quality analysis (including visualization)
- 2.2: Time series visualizations
- 2.2: Distribution analysis with histograms
- 2.2: Correlation analysis and heatmaps
- 2.2: Daily pattern analysis
First: Helper functions for interacting with images and os to delete temporary files.
Second: Main function for visualizing
# adapted https://stackoverflow.com/questions/6996603/how-can-i-delete-a-file-or-folder-in-python
def delete_images(files: list) -> None:
"""Deletes the files specified in the list of file paths.
Parameters
----------
files: list[str]
List of names of image files to put into .pdf file.
Additional output
----------
Deltes list of images.
Returns
-------
None
"""
for file in files:
try:
if os.path.exists(file):
os.remove(file)
#print(f"Deleted: {file}")
else:
print(f"File not found: {file}")
except Exception as e:
print(f"Error deleting {file}: {e}")
# adapted https://stackoverflow.com/questions/40906463/png-images-to-one-pdf-in-python
# and https://www.geeksforgeeks.org/save-multiple-matplotlib-figures-in-single-pdf-file-using-python/
def save_image(image_names: list, out_dir: str, filename: str) -> None:
"""Gathers multiple plt.figure obejcts and outputs thm into a .pdf
Parameters
----------
image_names: list[str]
List of names of image files to put into .pdf file
out_dir: str
Path to the directory of output .pdf file
filename: str
Name of output .pdf file
Additional output
----------
Saves a .pdf created by multiple .pngs into specified directory
Returns
-------
None
"""
image_list = [] #contains opened files
for name in image_names:
print(name)
image_list.append(Image.open(name))
image_list[0].save(f"{out_dir}{filename}_allPlots.pdf", save_all=True, append_images=image_list[1:])
for image in image_list:
image.close()
print(f"{out_dir}{filename}_allPlots.pdf")
delete_images(image_names)
def visu_raw_data(show_plots: bool, file_dir: str, file_name: str, df_arg: pd.DataFrame, isRaw: bool = True):
"""Load and visualize filtered and processed test system performance data.
Parameters
----------
show_plots: bool
Just output files or display in notebook
file_dir : str
Path to the CSV data file location (directory)
file_name : str
Name of the specified CSV file
isRaw : bool (Default: True)
function can be used to visualize any raw or processed -> changes data_type (string) and out_dir (string)
optional
df_arg: pd.DataFrame
output from load_system_data()
Additional output
----------
saves visualized data into dir "./visu_raw" by calling save_image() and cleaning temp-files with delete_images()
Returns
-------
None
"""
# Check DataFrame was passed
if isinstance(df_arg, pd.DataFrame):
df = df_arg
# File name and path -> pd used => no identifier => using "./"
out_dir = "./"
out_name = "Visu_output_noident"
print("Function called with a DataFrame.")
else:
# Attempt to read the DataFrame from file
try:
file_path = file_dir + file_name
df = pd.read_csv(file_path, delimiter = ",",usecols=["datetime","load-15m","memory_used_pct","cpu-user","cpu-system","sys-thermal","sys-interrupt-rate","server-up","disk-io-time"])
print(f"Function called with a file: {file_path}")
df['datetime'] = pd.to_datetime(df['datetime'])
df.set_index('datetime', inplace=True)
# File name and path -> path used => use identifier
out_dir = pathVisuRaw
out_name = file_name.replace('.csv', '')
except Exception as e:
print(f"Error loading the file: {e}")
return None
measurements = {
"load-15m": ('load-15m', '%'),
"memory_used_pct": ('memory_used_pct', '%'),
"cpu-user": ('cpu-user', 'delta-s'),
"cpu-system": ('cpu-system', 'delta-s'),
"sys-thermal": ('sys-thermal', 'avg delta-°C/min'),
"sys-interrupt-rate": ('sys-interrupt-rate', 'delta-s'),
"disk-io-time": ('disk-io-time', 'delta-s')
#,"server-up": ('server-sup', '')
}
if (isRaw):
data_type = "Raw"
else:
data_type = "Processed"
out_dir = pathOnlyProcessed
image_names = []
image_nr = 0
# Plot 1: Time-Series
fig, axes = plt.subplots(4, 2, figsize=(15, 30))
fig.suptitle(f"Tme-Series - {data_type} Data", fontsize=16, y=1.02)
for i,(measure, (title, unit)) in enumerate(measurements.items()): # iterate over all measurements
row = i // 2
col = i % 2
df.iloc[::10].pivot(columns='server-up', values=measure).plot(ax=axes[row, col],alpha=0.7, linewidth=2,color=['red','blue']) # only use every 10th entry
axes[row, col].set_title(f'Time-Series of {measure.upper()}')
axes[row, col].set_xlabel('Datetime')
axes[row, col].set_ylabel(f'{title} ({unit})')
axes[row, col].grid(True)
axes[row, col].legend()
#---------------------------------------------- File output
plt.tight_layout()
temp_name = f"{out_dir}{out_name}_plot_{image_nr}.png"
fig.savefig(temp_name, dpi=150, bbox_inches='tight')
image_names.append(temp_name)
image_nr += 1
#----------------------------------------------
# Plot 2: Daily Patterns
fig, axes = plt.subplots(4, 2, figsize=(15, 30))
fig.suptitle(f"Daily Patterns of {data_type} Measurements - mean & std ", fontsize=16, y=1.02)
# Create hour column for grouping
df_hour = df.copy()
df_hour['hour'] = df_hour.index.hour # set index to hour (of day)
for i,(measure, (title, unit)) in enumerate(measurements.items()):
row = i // 2
col = i % 2
# Calculate hourly statistics
hourly_stats = df_hour.groupby('hour')[measure].agg(['mean', 'std'])
# Plot mean with standard deviation
axes[row, col].plot(hourly_stats.index, hourly_stats['mean'], 'b-', label='Mean')
axes[row, col].fill_between(
hourly_stats.index,
hourly_stats['mean'] - hourly_stats['std'],
hourly_stats['mean'] + hourly_stats['std'],
alpha=0.2,
label='±1 std'
)
axes[row, col].set_title(f'Daily {title.capitalize()} Pattern')
axes[row, col].set_xlabel('Hour of Day')
axes[row, col].set_ylabel(f'{title} ({unit})')
axes[row, col].grid(True)
axes[row, col].legend()
#---------------------------------------------- File output
plt.tight_layout()
temp_name = f"{out_dir}{out_name}_plot_{image_nr}.png"
fig.savefig(temp_name, dpi=150, bbox_inches='tight')
image_names.append(temp_name)
image_nr += 1
#----------------------------------------------
# Plot 3: Hour-wise Distributions
fig, axes = plt.subplots(4, 2, figsize=(15, 30))
fig.suptitle(f" {data_type} Measurement Distributions by Hour - Boxplots", fontsize=16, y=1.02)
for i,(measure, (title, unit)) in enumerate(measurements.items()):
row = i // 2
col = i % 2
#calculates and plots hou wise boxplot
df_hour.boxplot(
ax=axes[row, col],
column=measure,
by='hour'
)
axes[row, col].set_title(f'Daily Pattern of {title} ')
axes[row, col].set_xlabel('Hour of Day')
axes[row, col].set_ylabel(f'{title} ({unit})')
axes[row, col].grid(True)
#---------------------------------------------- File output
plt.tight_layout()
temp_name = f"{out_dir}{out_name}_plot_{image_nr}.png"
fig.savefig(temp_name, dpi=150, bbox_inches='tight')
image_names.append(temp_name)
image_nr += 1
#----------------------------------------------
# Plot 4 Histograms - Distribution
fig, axes = plt.subplots(4,2, figsize = (15, 25))
fig.suptitle(f"Sensor {data_type} Measurements Distributions", fontsize = 14)
for i,(measure, (title, unit)) in enumerate(measurements.items()):
row = i // 2
col = i % 2
bin_num = 50 # was determined by trial an error
# 200 bins are created for each measurement
axes[row, col].hist(df[measure], bins = bin_num*4, density = True, alpha = 0.7, label = 'Histogram')
axes[row, col].set_title(f'Distribution of {title} ')
axes[row, col].set_xlabel( f'{title} ({unit})')
axes[row, col].set_ylabel('Density')
axes[row, col].grid(True)
#second axis for line graph
# 50 points for the line are used better average behaviour
ax_2 = axes[row, col].twinx()
counts, bins = np.histogram(df[measure], bins = bin_num)
bin_centers = (bins[:-1] + bins [1:]) / 2
ax_2.plot(bin_centers, counts/counts.sum(), 'r-', lw = 2, label = 'Distribution')
ax_2.tick_params(axis='y', labelcolor='r')
ax_2.legend()
#---------------------------------------------- File output
plt.tight_layout()
temp_name = f"{out_dir}{out_name}_plot_{image_nr}.png"
fig.savefig(temp_name, dpi=150, bbox_inches='tight')
image_names.append(temp_name)
image_nr += 1
#----------------------------------------------
# Plot 5: Correlation Analysis
fig, (ax) = plt.subplots(1, 1, figsize=(15, 10))
fig.suptitle(f"Correlation Analysis - of {data_type} Measurements Correlations", y=1.02, fontsize=16)
# Correlations heatmap
sns.heatmap(
df[measurements.keys()].corr(),
annot=True,
cmap='coolwarm',
center=0,
fmt='.2f',
ax=ax
)
#---------------------------------------------- File output
plt.tight_layout()
temp_name = f"{out_dir}{out_name}_plot_{image_nr}.png"
fig.savefig(temp_name, dpi=150, bbox_inches='tight')
image_names.append(temp_name)
image_nr += 1
#----------------------------------------------
# Plot 6 Hexbins
measure = list(measurements.keys())
pairs = [(measure[i], measure[j]) for i in range(len(measure)) for j in range(i + 1, len(measure))] # https://www.w3schools.com/python/python_lists_comprehension.asp
n_rows = 7
n_cols = 3
fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 5 * n_rows))
fig.suptitle(f"Hexbins of {data_type} Measurements", y=1.02, fontsize=16)
axes = axes.flatten()
# Loop over pairs
for i, (measure1, measure2) in enumerate(pairs):
ax = axes[i]
x = df[measure1]
y = df[measure2]
title1, unit1 = measurements[measure1]
title2, unit2 = measurements[measure2]
hb = ax.hexbin(x, y, gridsize=100, cmap='viridis')
ax.set_xlabel(f'{title1} ({unit1})')
ax.set_ylabel(f'{title2} ({unit2})')
ax.set_title(f'Hexbin: {title1} vs {title2}')
fig.colorbar(hb, ax=ax)
#---------------------------------------------- File output
plt.tight_layout()
temp_name = f"{out_dir}{out_name}_plot_{image_nr}.png"
fig.savefig(temp_name, dpi=150, bbox_inches='tight')
image_names.append(temp_name)
image_nr += 1
#----------------------------------------------
# Plot 7: Scatter Matrix
# Get data without duplicates by taking mean for each timestamp
df_plot = df.groupby(df.index)[measure].mean()
try:
pp = sns.pairplot(data=df_plot,
diag_kind='kde',
plot_kws={'alpha': 0.5, 's': 20},
height = 3,
corner=True)
except Exception as e:
print(f"Warning: Could not create scatter matrix plot: {str(e)}")
fig = pp.figure
fig.suptitle('Scatter Matrix of Raw Measurements', y=1.02, fontsize=16)
#---------------------------------------------- File output
plt.tight_layout()
temp_name = f"{out_dir}{out_name}_plot_{image_nr}.png"
fig.savefig(temp_name, dpi=200, bbox_inches='tight')
image_names.append(temp_name)
image_nr += 1
#----------------------------------------------
save_image(image_names, out_dir, out_name)
if not show_plots:
plt.close("all")
# testing
visu_raw_data(True, pathFilter, present,None, False)
Function called with a file: ./data_filtered/system-19.csv ./visu_only_processed/system-19_plot_0.png ./visu_only_processed/system-19_plot_1.png ./visu_only_processed/system-19_plot_2.png ./visu_only_processed/system-19_plot_3.png ./visu_only_processed/system-19_plot_4.png ./visu_only_processed/system-19_plot_5.png ./visu_only_processed/system-19_plot_6.png ./visu_only_processed/system-19_allPlots.pdf
#Run Visualization of Raw
# for file in files:
# visu_raw_data(False, pathFilter,file,None)
# plt.close("all") #for safety
Analysis¶
- 2.1: Original data quality analysis (including visualization)
- Has spikes (outliers) especially bad one in sys-interrupt and CPU-user
- Includes Nan- or empty-entries
- large hour-wise distributions due to the outliers
Processing¶
- thresholds- and IQR-method
- aggregation
def remove_outliers_iqr(show_process_status: bool, df:pd.DataFrame, column: str) -> tuple:
"""Remove outliers using IQR method.
Why this method: https://builtin.com/articles/1-5-iqr-rule
Parameters
----------
show_process_status : bool
print status in console
df : pd.DataFrame
input data for cleaning
column: str
current column to "look at"
Returns
-------
(pd.Series, pd.Series)
cleaned data , outliers
"""
Q1 = df[column].quantile(0.25)
Q3 = df[column].quantile(0.75)
IQR = Q3 - Q1
# decision range: lower & upper bound
# (*1.5) -> rule comes closest to Gaussian distribution concerning the outlier detection (*1.7 = 3 sigma)
valid_mask = (df[column] >= Q1 - 1.5*IQR) & (df[column] <= Q3 + 1.5*IQR)
invalid_count = (~valid_mask).sum()
if show_process_status:
print(f"IQR: Removing {invalid_count} outliers from {column}")
return df[column].where(valid_mask, np.nan), df[column].where(~valid_mask)
def handle_missing_values(data: pd.DataFrame, column: str, max_gap: int = 8) -> pd.Series: # not used
"""Interpolate missing values with limit.
Parameters
----------
data : pd.DataFrame
_description_
column : str
_description_
max_gap: int, optional
_description_ (Defaults to 8.)
Returns
----------
pd.Series :
_description_
"""
return data[column].interpolate(
method='linear',
limit=max_gap # Only fill gaps up to 8 points
)
def preprocess_system_data(show_process_status: bool, file_dir: str, file_name: str, df_arg: pd.DataFrame = None) -> list:
"""Preprocess system performance data.
Cleans data with:
* Invalid values removed
* Duplicates handled
* Outliers removed
* Missing values interpolated
Parameters
----------
show_process_status: bool
...
file_dir : str
Path to the CSV data file location (directory)
file_name : str
Name of the specified CSV file
optional
df_arg: pd.DataFrame
output from load_system_data()
Returns
-------
df_original
...
df_cleaned
...
str: filename
...
"""
# Check DataFrame was passed
if isinstance(df_arg, pd.DataFrame):
df = df_arg
print("Function called with a DataFrame.")
else:
# Attempt to read the DataFrame from file
try:
file_path = file_dir + file_name
df = pd.read_csv(file_path, delimiter = ",",usecols=["datetime","load-15m","memory_used_pct","cpu-user","cpu-system","sys-thermal","sys-interrupt-rate","server-up","disk-io-time"])
print(f"Function called with a file: {file_path}")
df['datetime'] = pd.to_datetime(df['datetime'])
df.set_index('datetime', inplace=True)
except Exception as e:
print(f"Error loading the file: {e}")
return None
# Store original data
df_original = df.copy()
df_outliers = df.copy()
out_dir = pathProcessd
valid_ranges = {
"load-15m": (0, 100),
"memory_used_pct": (0, 100),
"cpu-user": (0.0, 2.0),
"cpu-system": (0.0, 2.0),
"sys-thermal": (-10, 10),
"sys-interrupt-rate": (0, 100000),
"disk-io-time": (0, 1.0),
"server-up": (0, 2)
}
columns = list(valid_ranges.keys())
# 1. Handle invalid values
#(thresholds for sensible bounds determined by examining raw datasets)
for column, (min_val, max_val) in valid_ranges.items():
invalid_mask = (df[column] < min_val) | (df[column] > max_val)
if show_process_status:
print(f"Ranges: Removing {invalid_mask.sum()} invalid values from {column}")
df.loc[invalid_mask, column] = np.nan
# 2. Handle duplicates
if show_process_status:
print("Handling duplicates ...")
df = df.groupby(['datetime', 'server-up']).agg({ #Groups df by combination of: datetime and server-up
'load-15m': 'mean',
'memory_used_pct': 'mean',
'cpu-user': 'mean',
"cpu-system": 'mean',
'sys-thermal': 'mean' ,
"sys-interrupt-rate": 'mean',
"disk-io-time": 'mean'
}).reset_index()
#Result: row unique combination of datetime and server-up -> aggregated by mean (2.5 | 3.5 -> 3.0)
# 3. Remove outliers
for column in columns:
df[column],df_outliers[column] = remove_outliers_iqr(show_process_status, df, column)
# testing df.to_csv("noHandling_data.csv", index=False)
# 4. Handle missing values <-- just delete rows with empty entries -> no interpolation !
# there is ennough data and interpolating seemed like
# misrepresentation of data characteristics and
# as bias introduction
'''if show_process_status:
print("\nHandling missing values...")
print(f"Missing values before handling: \n{df.isnull().sum()}")
for column, (min_val, max_val) in valid_ranges.items():
df[column] = handle_missing_values(df, column,4)'''
df_cleaned = df.dropna()
df_cleaned.set_index('datetime', inplace=True)
# testing print("After dropping empty entries: \n", df_cleaned.head())
# Sort by datetime
df_cleaned.sort_index(inplace=True)
if show_process_status:
print(f"Missing values after handling: \n{df_cleaned.isnull().sum()}")
print(f"\nOriginal shape: {df_original.shape}")
print(f"Cleaned shape: {df_cleaned.shape}")
df_cleaned.to_csv(out_dir+file_name, index=True)
return [df_original, df_cleaned, file_name]
# tesing
preprocess_system_data(True, pathFilter,present,None)
Function called with a file: ./data_filtered/system-19.csv Ranges: Removing 0 invalid values from load-15m Ranges: Removing 0 invalid values from memory_used_pct Ranges: Removing 0 invalid values from cpu-user Ranges: Removing 0 invalid values from cpu-system Ranges: Removing 0 invalid values from sys-thermal Ranges: Removing 0 invalid values from sys-interrupt-rate Ranges: Removing 0 invalid values from disk-io-time Ranges: Removing 0 invalid values from server-up Handling duplicates ... IQR: Removing 2690 outliers from load-15m IQR: Removing 284 outliers from memory_used_pct IQR: Removing 12113 outliers from cpu-user IQR: Removing 12765 outliers from cpu-system IQR: Removing 26 outliers from sys-thermal IQR: Removing 8282 outliers from sys-interrupt-rate IQR: Removing 2732 outliers from disk-io-time IQR: Removing 6 outliers from server-up Missing values after handling: server-up 0 load-15m 0 memory_used_pct 0 cpu-user 0 cpu-system 0 sys-thermal 0 sys-interrupt-rate 0 disk-io-time 0 dtype: int64 Original shape: (86383, 8) Cleaned shape: (68211, 8)
[ load-15m sys-interrupt-rate sys-thermal disk-io-time \
datetime
1970-01-01 00:00:00 2.0 1322.05 0.00 0.0002
1970-01-01 00:00:30 1.0 1236.20 1.40 0.0002
1970-01-01 00:01:00 1.0 1266.25 0.00 0.0002
1970-01-01 00:01:30 1.0 1248.10 1.45 0.0002
1970-01-01 00:02:00 1.0 1290.00 1.45 0.0002
... ... ... ... ...
1970-01-30 23:58:00 0.0 1267.25 1.45 0.0006
1970-01-30 23:58:30 0.0 1266.45 1.50 0.0002
1970-01-30 23:59:00 0.0 1354.75 1.50 0.0004
1970-01-30 23:59:30 0.0 1276.20 1.55 0.0006
1970-01-31 00:00:00 0.0 1289.25 1.55 0.0008
cpu-system cpu-user server-up memory_used_pct
datetime
1970-01-01 00:00:00 0.0255 0.0320 2 15.024197
1970-01-01 00:00:30 0.0210 0.0240 2 15.024452
1970-01-01 00:01:00 0.0220 0.0235 2 15.025318
1970-01-01 00:01:30 0.0245 0.0250 2 15.064915
1970-01-01 00:02:00 0.0305 0.0335 2 15.068279
... ... ... ... ...
1970-01-30 23:58:00 0.0415 0.0425 2 23.212204
1970-01-30 23:58:30 0.0420 0.0435 2 23.207464
1970-01-30 23:59:00 0.0395 0.0460 2 23.194826
1970-01-30 23:59:30 0.0455 0.0430 2 23.291347
1970-01-31 00:00:00 0.0485 0.0390 2 23.290226
[86383 rows x 8 columns],
server-up load-15m memory_used_pct cpu-user \
datetime
1970-01-01 00:00:00 2.0 2.0 15.024197 0.0320
1970-01-01 00:00:30 2.0 1.0 15.024452 0.0240
1970-01-01 00:01:00 2.0 1.0 15.025318 0.0235
1970-01-01 00:01:30 2.0 1.0 15.064915 0.0250
1970-01-01 00:02:00 2.0 1.0 15.068279 0.0335
... ... ... ... ...
1970-01-30 20:53:00 2.0 17.0 16.842316 0.0410
1970-01-30 20:53:30 2.0 17.0 16.952750 0.0410
1970-01-30 20:54:00 2.0 16.0 17.044635 0.0470
1970-01-30 20:54:30 2.0 16.0 17.030467 0.0310
1970-01-30 20:55:00 2.0 15.0 17.126734 0.0320
cpu-system sys-thermal sys-interrupt-rate disk-io-time
datetime
1970-01-01 00:00:00 0.0255 0.00 1322.05 0.0002
1970-01-01 00:00:30 0.0210 1.40 1236.20 0.0002
1970-01-01 00:01:00 0.0220 0.00 1266.25 0.0002
1970-01-01 00:01:30 0.0245 1.45 1248.10 0.0002
1970-01-01 00:02:00 0.0305 1.45 1290.00 0.0002
... ... ... ... ...
1970-01-30 20:53:00 0.0380 0.05 1291.50 0.0000
1970-01-30 20:53:30 0.0395 0.05 1268.60 0.0000
1970-01-30 20:54:00 0.0410 0.05 1261.70 0.0002
1970-01-30 20:54:30 0.0210 0.10 1244.75 0.0000
1970-01-30 20:55:00 0.0180 1.55 1169.05 0.0000
[68211 rows x 8 columns],
'system-19.csv']
# Run processing
# anylist = []
# for file in files:
# anylist = preprocess_system_data(False, pathFilter,file,None)
# print(anylist[0], " : \n", anylist[1].describe(),"\n", anylist[2].describe())
# Test: if processed data makes sense
#cache_list = preprocess_system_data(True, pathFilter,"system-1.csv",None)
visu_raw_data(True, pathOnlyProcessed,present,None,False)
Error loading the file: [Errno 2] No such file or directory: './visu_only_processed/system-19.csv'
#Run Visualization of Processed
# for file in files:
# visu_raw_data(False, pathProcessd,file,None, False)
# plt.close("all") #for safety
Loadfile¶
- in case different sets shall be compared.
- if data is to be loaded into a dataframe instead of directly accessed by a function.
- Otherwise visu_processed_data will be called directly after preprocess_system_data(). Since their IOs are suitable.
def load_file(file_dir: str, file_name: str, create_description: bool = False) -> tuple:
"""Loads file from path and returns dataframe and its name (as tuple).
Parameters
----------
file_dir : str
Path to the CSV data file location (directory)
file_name : str
Name of the specified CSV file
create_description: bool
Additional output
----------
pandas.description of passed dataset into the directory the file was called from.
Returns
--------
tuple(pd.DataFrame, str)
pd.DataFrame: _description_
...
str: file_name
...
"""
try:
file_path = file_dir + file_name
df = pd.read_csv(file_path, delimiter = ",",usecols=["datetime","load-15m","memory_used_pct","cpu-user","cpu-system","sys-thermal","sys-interrupt-rate","server-up","disk-io-time"])
print(f"Function called with a file: {file_path}")
df['datetime'] = pd.to_datetime(df['datetime'])
df.set_index('datetime', inplace=True)
# File name and path -> path used => use identifier
file_name = file_name.replace('.csv', '')
if create_description:
df.describe().to_csv(f'{file_dir}{file_name}_desciption.csv')
return (df, file_name)
except Exception as e:
print(f"Error loading the file: {e}")
return (None,None)
#testing
#print(load_file(pathFilter, files[0]))
# creating data description for all processed datasets.
# for file in files:
# load_file(pathProcessd,file, True)
# plt.close("all") #for safety
Visualize Raw & Processed¶
- 2.2: Time series visualizations
- 2.2: Distribution analysis with histograms
- 2.2: Correlation analysis and heatmaps
- 2.2: Daily pattern analysis
def visu_processed_data(show_plots: bool, df_original: pd.DataFrame, df_cleaned: pd.DataFrame, filename: str) -> None:
"""visualize original and processed test system performance data.
Parameters
----------
show_plots: bool
Just output files or display in notebook
df_cleaned: pd.DataFrame
...
df_original: pd.DataFrame
...
filename: str
... for pdf output
Additional outputs
saves visualized data into dir "./visu_processed" by calling save_image() and cleaning temp-files with delete_images()
Returns
-------
None
"""
out_dir = pathVisuProcessed
out_name = filename.replace('.csv','')
image_names = []
image_nr = 0
measurements = {
"load-15m": ('load-15m', '%'),
"memory_used_pct": ('memory_used_pct', '%'),
"cpu-user": ('cpu-user', 'delta-s'),
"cpu-system": ('cpu-system', 'delta-s'),
"sys-thermal": ('sys-thermal', 'avg delta-°C/min'),
"sys-interrupt-rate": ('sys-interrupt-rate', 'delta-s'),
"disk-io-time": ('disk-io-time', 'delta-s')
#,"server-up": ('server-sup', '')
}
measures = list(measurements.keys())
# Plot 1: Time-Series
# see general system behaviour and easy to spot anomalies
# e.g. data-error -> blank ... or system-3: sucessive rise in memory_usage would suggest a memeory-leak
# system-19 shows a clear problem around 16-17.01. visible in load-15m, memory_used and best visible in sisk-io (processed data)
fig, axes = plt.subplots(4, 2, figsize=(15, 30))
fig.suptitle('Tme-Series - Raw Data', fontsize=16, y=1.02)
for i,(measure, (title, unit)) in enumerate(measurements.items()):
row = i // 2
col = i % 2
df_original[measure].iloc[::10].plot(ax=axes[row, col], color='lightblue', alpha=0.3, label='Original')
df_cleaned[measure].iloc[::10].plot(ax=axes[row, col], color='green', alpha=0.5, label='Cleaned')
axes[row, col].set_title(f'Time-Series of {measure.upper()}')
axes[row, col].set_xlabel('Datetime')
axes[row, col].set_ylabel(f'{title} ({unit})')
axes[row, col].grid(True)
axes[row, col].legend()
#---------------------------------------------- File output
plt.tight_layout()
temp_name = f"{out_dir}{out_name}_plot_{image_nr}.png"
fig.savefig(temp_name, dpi=150, bbox_inches='tight')
image_names.append(temp_name)
image_nr += 1
#----------------------------------------------
# Plot 2: Daily Patterns
# shows hourly behaviour (mean and std) overlaying original / processed shows better (thighter) std
# but also might suggest loss of relevant data by processing: cpu-system behaviour and sys-interrupt
fig, axes = plt.subplots(4, 2, figsize=(15, 30))
fig.suptitle('Daily Patterns of Raw & Processed Measurements - mean & std ', fontsize=16, y=1.02)
# Create hour column for grouping
df_hour_orig = df_original.copy()
df_hour_clean = df_cleaned.copy()
df_hour_orig['hour'] = df_hour_orig.index.hour
df_hour_clean['hour'] = df_hour_clean.index.hour
for i,(measure, (title, unit)) in enumerate(measurements.items()):
row = i // 2
col = i % 2
# Calculate hourly statistics
hourly_stats_orig = df_hour_orig.groupby('hour')[measure].agg(['mean', 'std'])
hourly_stats_clean = df_hour_clean.groupby('hour')[measure].agg(['mean', 'std'])
# Plot mean with standard deviation
axes[row, col].plot(hourly_stats_clean.index, hourly_stats_clean['mean'],
'g-', label='Mean Processed')
axes[row, col].fill_between(
hourly_stats_clean.index,
hourly_stats_clean['mean'] - hourly_stats_clean['std'],
hourly_stats_clean['mean'] + hourly_stats_clean['std'],
alpha=0.3,
color='lightgreen',
label='±1 std Processed'
)
axes[row, col].plot(hourly_stats_orig.index, hourly_stats_orig['mean'],
'b-', label='Mean Raw')
axes[row, col].fill_between(
hourly_stats_orig.index,
hourly_stats_orig['mean'] - hourly_stats_orig['std'],
hourly_stats_orig['mean'] + hourly_stats_orig['std'],
alpha=0.2,
color='lightblue',
label='±1 std Raw'
)
#ax_2.tick_params(axis='y', labelcolor='b')
axes[row, col].set_title(f'Daily {title.capitalize()} Pattern')
axes[row, col].set_xlabel('Hour of Day')
axes[row, col].set_ylabel(f'{title} ({unit})')
axes[row, col].grid(True)
axes[row, col].legend()
#---------------------------------------------- File output
plt.tight_layout()
temp_name = f"{out_dir}{out_name}_plot_{image_nr}.png"
fig.savefig(temp_name, dpi=150, bbox_inches='tight')
image_names.append(temp_name)
image_nr += 1
#----------------------------------------------
del df_hour_clean, df_hour_orig
# Plot 4 Histograms - Distribution (only original data)
# how is data distibuted, are ther multiple peaks (accumulations),
# problematic sice every system and every metric would need their specific amount of bins and averaging-line
# 50/200 was deemed the best compromise => most figures suggest normal distributions (sigular and multimodal)
# in case of system-19:
# cpus-user &-system aswell as sys-interrupt are normal distributed.
# load-15m (bad bin-amount), memory_used and sys-thermal are multi-modla
# disk-io looks to be multimodal, but in this systems-case there might be a problem (low-res. sensor or bad software readout)
# since compared to e.g. system-4 (two peaks) the datapoints are in suspiciously periodic (consistant) distnaces (0.002 delta-s)
fig, axes = plt.subplots(4,2, figsize = (15, 25))
fig.suptitle('Sensor Processed Measurements Distributions', fontsize = 14)
for i,(measure, (title, unit)) in enumerate(measurements.items()):
row = i // 2
col = i % 2
bin_num = 50
axes[row, col].hist(df_cleaned[measure], bins = bin_num*4, density = True, alpha = 0.7)
axes[row, col].set_title(f'Distribution of {title} ')
axes[row, col].set_xlabel( f'{title} ({unit})')
axes[row, col].set_ylabel('Density')
axes[row, col].grid(True)
#second axis for line graph
ax_2 = axes[row, col].twinx()
counts, bins = np.histogram(df_cleaned[measure], bins = bin_num)
bin_centers = (bins[:-1] + bins [1:]) / 2
ax_2.plot(bin_centers, counts/counts.sum(), 'r-', lw = 2, label = 'Distribution')
ax_2.tick_params(axis='y', labelcolor='r')
ax_2.legend()
#---------------------------------------------- File output
plt.tight_layout()
temp_name = f"{out_dir}{out_name}_plot_{image_nr}.png"
fig.savefig(temp_name, dpi=150, bbox_inches='tight')
image_names.append(temp_name)
image_nr += 1
#----------------------------------------------
# Plot 5: Correlation Analysis
# how does one metric correlate with another ? does cpu-user lead to high load-15m ?
# highest correlations:
# cpu-system(/-user) & sys-interrupts
# cpu-system & -user (which is to be expected)
# cpu-user & memory_used (also expected as a working cpu probably requests larger data outside its L1-3 caches)
# sys-thermals show no correlation (counter-intuitive): assumed reason being that value is absolute temp-change -> hard to correlate cooling/heating
# also interesting that there is "low" correlation between memory_used and disk-io
# -> if information is requested for a process (calculation) from disk it should be moved to RAM (system-4 correl. suggests this)
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 10))
fig.suptitle('Correlation Analysis - Original vs Cleaned', y=1.02, fontsize=16)
# Original correlations
sns.heatmap(
df_original[measures].corr(),
annot=True,
cmap='Blues', #coolwarm
center=0,
fmt='.2f',
ax=ax1
)
ax1.set_title('Original Data Correlations')
# Cleaned correlations
sns.heatmap(
df_cleaned[measures].corr(),
annot=True,
cmap='Greens',
center=0,
fmt='.2f',
ax=ax2
)
ax2.set_title('Cleaned Data Correlations')
#---------------------------------------------- File output
plt.tight_layout()
temp_name = f"{out_dir}{out_name}_plot_{image_nr}.png"
fig.savefig(temp_name, dpi=150, bbox_inches='tight')
image_names.append(temp_name)
image_nr += 1
#----------------------------------------------
# Plot 6 Hexbins
# Visualizing dense (aggregating) data, see trends and clustering (relationships), color gradients make it easier to digest and interopret
# poor results for some relations
pairs = [(measures[i], measures[j]) for i in range(len(measures)) for j in range(i + 1, len(measures))]
n_rows = 7
n_cols = 3
fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 5 * n_rows))
fig.suptitle('Hexbins of Processed Measurements', y=1.02, fontsize=16)
axes = axes.flatten()
for i, (measure1, measure2) in enumerate(pairs):
ax = axes[i]
x = df_cleaned[measure1]
y = df_cleaned[measure2]
title1, unit1 = measurements[measure1]
title2, unit2 = measurements[measure2]
hb = ax.hexbin(x, y, gridsize=100, cmap='viridis')
ax.set_xlabel(f'{title1} ({unit1})')
ax.set_ylabel(f'{title2} ({unit2})')
ax.set_title(f'Hexbin: {title1} vs {title2}')
fig.colorbar(hb, ax=ax)
#---------------------------------------------- File output
plt.tight_layout()
temp_name = f"{out_dir}{out_name}_plot_{image_nr}.png"
fig.savefig(temp_name, dpi=150, bbox_inches='tight')
image_names.append(temp_name)
image_nr += 1
#----------------------------------------------
# Plot 7: Scatter Matrix
# similar to hexbins -> was used to see clear distiction before and after processing
df_plot1 = df_original
df_plot1['State'] = 'raw'
df_plot2 = df_cleaned
df_plot2['State'] = 'processed'
df_plot1 = pd.concat([df_plot1, df_plot2])
#testing
# df_plot1.to_csv(out_dir+"concatPlot1Plot2.csv", index=True)
# print(df_plot1.shape, " and ", df_plot2.shape)
# print(df_plot1.head(), "\n", df_plot2.head())
del df_plot2
pp = None
try:
pp = sns.pairplot(data=df_plot1,
diag_kind='kde',
vars = measures,
hue='State',
markers=["o","D"],
plot_kws={'alpha': 0.5, 's': 20},
height = 3,
corner=True)
except Exception as e:
print(f"Warning: {str(e)}")
fig = pp.figure
fig.suptitle('Scatter Matrix of Raw and Processed Measurements', y=1.02, fontsize=16)
del df_plot1
#----------------------------------------------
plt.tight_layout()
temp_name = f"{out_dir}{out_name}_plot_{image_nr}.png"
fig.savefig(temp_name, dpi=200, bbox_inches='tight')
image_names.append(temp_name)
image_nr += 1
#----------------------------------------------
save_image(image_names, out_dir, out_name)
if not show_plots:
plt.close("all")
# testing
# load same file filtered and processed
ca1 = load_file(pathFilter,present)
ca2 = load_file(pathProcessd,present)
#visualize it (exporting pdf)
visu_processed_data(True, ca1[0], ca2[0], ca2[1])
Function called with a file: ./data_filtered/system-19.csv Function called with a file: ./data_processed/system-19.csv ./visu_processed/system-19_plot_0.png ./visu_processed/system-19_plot_1.png ./visu_processed/system-19_plot_2.png ./visu_processed/system-19_plot_3.png ./visu_processed/system-19_plot_4.png ./visu_processed/system-19_plot_5.png ./visu_processed/system-19_allPlots.pdf
# anylist = []
# for file in files:
# anylist = preprocess_system_data(False, pathFilter,file,None)
# visu_processed_data(False, anylist[0],anylist[1],anylist[2])
# #visu_processed_data(False, preprocess_system_data(False, pathFilter,file,None))
Analysis¶
- 2.1: Preprocessed vs original data visual analysis
- see code
- reduced outlier, higher correlation values
- other processing methods might be considered for cpu-metrics (see scatterplot)
- 2.3: Summary of observed patterns
- see code
- System load shows clear cycles (assumed testing in day-night pattern)
- Memory usage coincide with CPU usage
- apart from outliers system load stays within 0.3
- memory usage seems abnormally high, as it stays above 10% throughout the dataset
2.3 Probability Analysis¶
- Threshold-based probability estimation
- Cross tabulation analysis
- Conditional probability analysis
- Summary of observations from each task
def prob_analysis(show_plots: bool, file_dir: str, file_name: str, df_arg: pd.DataFrame = None) -> None:
"""Prob analysis: thresholds and conditional based on mean, 10%(max) and 90%(max) of metrics
visualized as tables and specific. crosstable for threshold: mean
Parameters
----------
show_process_status: bool
...
file_dir : str
Path to the CSV data file location (directory)
file_name : str
Name of the specified CSV file
optional
df_arg: pd.DataFrame
output from load_system_data()
Additional outputs
saves visualized data into dir "./visu_prob" by calling save_image() and cleaning temp-files with delete_images()
Returns
-------
None
"""
# Check DataFrame was passed
if isinstance(df_arg, pd.DataFrame):
df = df_arg
out_dir = "./"
out_name = "Visu_output_noident"
print("Function called with a DataFrame.")
else:
# Attempt to read the DataFrame from file
try:
file_path = file_dir + file_name
df = pd.read_csv(file_path, delimiter = ",",usecols=["datetime","load-15m","memory_used_pct","cpu-user","cpu-system","sys-thermal","sys-interrupt-rate","server-up","disk-io-time"])
print(f"Function called with a file: {file_path}")
df['datetime'] = pd.to_datetime(df['datetime'])
df.set_index('datetime', inplace=True)
# File name and path -> path used => use identifier
out_dir = pathProb
out_name = file_name.replace('.csv', '')
except Exception as e:
print(f"Error loading the file: {e}")
return None
image_names = []
image_nr = 0
measurements = {
"load-15m": ('load-15m', '%'),
"memory_used_pct": ('memory_used_pct', '%'),
"cpu-user": ('cpu-user', 'delta-s'),
"cpu-system": ('cpu-system', 'delta-s'),
"sys-thermal": ('sys-thermal', 'avg delta-°C/min'),
"sys-interrupt-rate": ('sys-interrupt-rate', 'delta-s'),
"disk-io-time": ('disk-io-time', 'delta-s')
}
measures = list(measurements.keys())
threshold_results = pd.DataFrame(columns=["Measurement", "Threshold", "Threshold_Value", "Probability_Larger", "Probability_Smaller"])
#cross_tab_results = pd.DataFrame()
conditional_prob_results = pd.DataFrame(columns=["Condition", "Probability"])
# Example placeholders for figures
fig, ax = plt.subplots(figsize=(10, 10))
# Placeholder for populating threshold-based probability estimation results
for measure in measures:
thresholds = [('mean',df[measure].mean()), ('10%(max)', df[measure].max()*0.1), ('90%(max)',df[measure].max()*0.9)] # Example thresholds
for name,threshold in thresholds:
# Placeholder calculation (replace with actual logic)
probability_1 = np.mean(df[measure] > threshold)
probability_2 = 1 - probability_1
threshold = np.round(threshold, decimals=4)
probability_1 = np.round(probability_1, decimals=4)
probability_2 = np.round(probability_2, decimals=4)
threshold_results = pd.concat([
threshold_results,
pd.DataFrame({"Measurement":[measure], "Threshold": [name], "Threshold_Value": [threshold], "Probability_Larger": [probability_1], "Probability_Smaller": [probability_2]})
], ignore_index=True)
# Output threshold results to a figure
ax.axis('off') # Turn off axis
ax.table(cellText=threshold_results.values, colLabels=threshold_results.columns, loc='center')
ax.set_title("Threshold-based Probability Estimation")
#----------------------------------------------
plt.tight_layout()
temp_name = f"{out_dir}{out_name}_plot_{image_nr}.png"
fig.savefig(temp_name, dpi=150, bbox_inches='tight')
image_names.append(temp_name)
image_nr += 1
#----------------------------------------------
fig, ax = plt.subplots(figsize=(8, 6))
crosstable_data = {}
for measure in measures:
threshold_mean = df[measure].mean()
total_count = len(df[measure])
larger_count = (df[measure] > threshold_mean).sum()
not_larger_count = total_count - larger_count
crosstable_data[measure] = {
" x > Mean": larger_count,
" x < Mean": not_larger_count,
"Total": total_count
}
# Convert to DataFrame for visualization
crosstable_df = pd.DataFrame(crosstable_data).transpose()
crosstable_df.index.name = "Measurement"
# Display the crosstable
ax.axis('off') # Hide axis
ax.table(
cellText=crosstable_df.values,
rowLabels=crosstable_df.index,
colLabels=crosstable_df.columns,
loc='center'
)
ax.set_title("Crosstable Analysis Based on Mean Threshold")
#----------------------------------------------
plt.tight_layout()
temp_name = f"{out_dir}{out_name}_plot_{image_nr}.png"
fig.savefig(temp_name, dpi=150, bbox_inches='tight')
image_names.append(temp_name)
image_nr += 1
#----------------------------------------------
fig, ax = plt.subplots(figsize=(10, 20))
# Generate pairs
pairs = [(measures[i], measures[j]) for i in range(len(measures)) for j in range(i + 1, len(measures))]
conditional_prob_results = pd.DataFrame(columns=["Condition", "Probability"])
# Loop over the pairs
for i, (measure1, measure2) in enumerate(pairs):
# mean of boolean union returns conditional probability
thresholds = [
(f"P({measure1} > (mean) | {measure2} > (mean))", lambda: np.mean((df[measure1] > df[measure1].mean()) & (df[measure2] > df[measure2].mean()))),
(f"P({measure1} < (mean) | {measure2} > (mean))", lambda: np.mean((df[measure1] < df[measure1].mean()) & (df[measure2] > df[measure2].mean()))),
(f"P({measure1} > 90%(max) | {measure2} > 90%(max))", lambda: np.mean((df[measure1] > df[measure1].max() * 0.9) & (df[measure2] > df[measure2].max() * 0.9))),
(f"P({measure1} < 10%(max) | {measure2} < 10%(max))", lambda: np.mean((df[measure1] < df[measure1].max() * 0.1) & (df[measure2] < df[measure2].max() * 0.1))),
(f"P({measure1} < 10%(max) | {measure2} > 90%(max))", lambda: np.mean((df[measure1] < df[measure1].max() * 0.1) & (df[measure2] > df[measure2].max() * 0.9))),
(f"P({measure2} < 10%(max) | {measure1} > 90%(max))", lambda: np.mean((df[measure2] < df[measure2].max() * 0.1) & (df[measure1] > df[measure1].max() * 0.9))),
(f"P({measure1} > 90%(max) | {measure2} < 10%(max))", lambda: np.mean((df[measure1] > df[measure1].max() * 0.9) & (df[measure2] < df[measure2].max() * 0.1))),
(f"P({measure2} > 90%(max) | {measure1} < 10%(max))", lambda: np.mean((df[measure2] > df[measure2].max() * 0.9) & (df[measure1] < df[measure1].max() * 0.1))),
]
for condition, calc in thresholds:
probability = calc()
probability = np.round(probability, decimals=4)
if probability >= 0.001: # Only add rows with probability >= 0.0001
conditional_prob_results = pd.concat([
conditional_prob_results,
pd.DataFrame({"Condition": [condition], "Probability": [probability]})
], ignore_index=True)
# Display or further process the results as needed
ax.axis('off')
ax.table(cellText=conditional_prob_results.values, colLabels=conditional_prob_results.columns, loc='center')
ax.set_title("Conditional Probability Analysis")
#----------------------------------------------
plt.tight_layout()
temp_name = f"{out_dir}{out_name}_plot_{image_nr}.png"
fig.savefig(temp_name, dpi=150, bbox_inches='tight')
image_names.append(temp_name)
image_nr += 1
#----------------------------------------------
save_image(image_names, out_dir, out_name)
fig, ax = plt.subplots(figsize=(10, 20))
conditional_prob_results = conditional_prob_results.sort_values(by="Probability", ascending=False)
ax.axis('off')
ax.table(cellText=conditional_prob_results.values, colLabels=conditional_prob_results.columns, loc='center')
ax.set_title("Conditional Probability Analysis")
if not show_plots:
plt.close("all")
#testing
prob_analysis(True, pathProcessd, present, None)
Function called with a file: ./data_processed/system-19.csv
C:\Users\rapha\AppData\Local\Temp\ipykernel_62264\881255731.py:76: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation. threshold_results = pd.concat([ C:\Users\rapha\AppData\Local\Temp\ipykernel_62264\881255731.py:155: FutureWarning: The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation. conditional_prob_results = pd.concat([
./visu_prob/system-19_plot_0.png ./visu_prob/system-19_plot_1.png ./visu_prob/system-19_plot_2.png ./visu_prob/system-19_allPlots.pdf
# # Run analysis for alldatasets
# for file in files:
# prob_analysis(False, pathProcessd, file, None)
Analysis¶
- 2.3 Summary of observations from each task
- even though basic, it is still interesting to see the deviations of distributions between mean and madian.
- Also the "directionality" of conditional probability can be observed
2.3 Statistical Theory Applications¶
- Law of Large Numbers demonstration
- Central Limit Theorem application
- Result interpretation
...
# Initial setup
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
pathStatisticslTheory = "./visu_statistical_theory/"
def visualize_statistical_theory(show_plots: bool, df_cleaned: pd.DataFrame, filename: str) -> None :
"""
Visualize the law of large numbers and the central limit theorem using cleaned data
Parameters
----------
show_plots: bool
Just output files or display in notebook
df_cleaned: pd.DataFrame
...
filename: str
... for pdf output
Additional outputs
saves visualized data into dir "./visu_statistical_theory" by calling save_image() and cleaning temp-files with delete_images()
Returns
-------
None
"""
out_dir = pathStatisticslTheory
out_name = filename.replace('.csv','')
image_names = []
image_nr = 0
measurements = {
"load-15m": ('load-15m', '%'),
"memory_used_pct": ('memory_used_pct', '%'),
"cpu-user": ('cpu-user', 'delta-s'),
"cpu-system": ('cpu-system', 'delta-s'),
"sys-thermal": ('sys-thermal', 'avg delta-°C/min'),
"sys-interrupt-rate": ('sys-interrupt-rate', 'delta-s'),
"disk-io-time": ('disk-io-time', 'delta-s')
}
# The Law of Large Numbers
# Setup samples and threshold
sample_size = 10000
# Setup plots
fig, axes = plt.subplots(4, 2, figsize=(15, 30))
fig.suptitle(f'Demonstration of the Law of Large Numbers - Convergence over Sample Size- Processed Data - (Sample size = {sample_size})', fontsize=16, y=1.02)
# Setup data with a copy with resettint the datetime index to an index (not sure if needed actually)
reset_index_cleaned = df_cleaned.reset_index(drop=True)
# Create plots for each measurement of a system
for i,(measure, (title, unit)) in enumerate(measurements.items()):
row = i // 2
col = i % 2
# Calculate true probability
threshold = np.mean(df_cleaned[measure])
calc_prob = np.mean(reset_index_cleaned[measure] > threshold)
# Calculate the observed probability that the sample value is greater that the threshold
observed_probs = []
for n in range(sample_size):
sample = reset_index_cleaned[measure].sample(n=n, replace=True)
observed_prob = np.mean(sample > threshold)
observed_probs.append(observed_prob)
# Convergence over sample size
axes[row, col].plot(range(sample_size), observed_probs, 'b-', label=f'Obseverd Probability')
axes[row, col].axhline(y=calc_prob, color='r', linestyle='--', label=f'Calculated Probability ({calc_prob:.3f})')
axes[row, col].set_title(f'The Law of Large Numbers of {measure.upper()}')
axes[row, col].set_xlabel(f'Sample Size (log scale)')
axes[row, col].set_ylabel(f'Probability of {title} > {threshold:.5f}')
axes[row, col].grid(True)
axes[row, col].set_xscale('log')
axes[row, col].legend()
#----------------------------------------------
plt.tight_layout()
temp_name = f"{out_dir}{out_name}_plot_{image_nr}.png"
fig.savefig(temp_name, dpi=150, bbox_inches='tight')
image_names.append(temp_name)
image_nr += 1
#----------------------------------------------
# Setup plots
fig, axes = plt.subplots(4, 2, figsize=(15, 30))
fig.suptitle(f'Demonstration of the Law of Large Numbers - Errors vs Sample Size - Processed Data - (Sample size = {sample_size})', fontsize=16, y=1.02)
# Create plots for each measurement of a system
for i,(measure, (title, unit)) in enumerate(measurements.items()):
row = i // 2
col = i % 2
# Calculate true probability
threshold = np.mean(df_cleaned[measure])
calc_prob = np.mean(reset_index_cleaned[measure] > threshold)
# Calculate the observed probability that the sample value is greater that the threshold
observed_probs = []
for n in range(sample_size):
sample = reset_index_cleaned[measure].sample(n=n, replace=True)
observed_prob = np.mean(sample > threshold)
observed_probs.append(observed_prob)
# Error vs sample size
errors = np.abs(np.array(observed_probs) - calc_prob)
axes[row, col].plot(range(sample_size), errors, 'g-', alpha=0.6)
axes[row, col].set_xscale('log')
axes[row, col].set_yscale('log')
axes[row, col].set_xlabel('Sample Size (log scale)')
axes[row, col].set_ylabel('Absolute Error (log scale)')
axes[row, col].set_title('Convergence Error')
axes[row, col].grid(True)
# Print summary statistics
print(f"\nLaw of Large Numbers Summary for {measure}")
print("-" * 50)
print(f"True Probability: {calc_prob:.4f}")
print("\nConvergence at different sample sizes:")
for size in [10, 100, 1000, 10000]:
#idx = np.argmin(np.abs(sample_size - size))
idx = min(size, sample_size) - 1
error = errors[idx]
print(f"n={size:5d}: P={observed_probs[idx]:.4f} (error: {error:.4f})")
#----------------------------------------------
plt.tight_layout()
temp_name = f"{out_dir}{out_name}_plot_{image_nr}.png"
fig.savefig(temp_name, dpi=150, bbox_inches='tight')
image_names.append(temp_name)
image_nr += 1
#----------------------------------------------
# The Central Limit Theorem
# Setup sample sizea and number of samples
sample_size = [1, 5, 10, 50] # number of data points drawn in a single run -> better approximation of the true mean
n_samples = 1000 # number of times the sampling process is repeated
# Run using differen sample sizes
for sample_size in sample_size :
# Setup figures
fig, axes = plt.subplots(4, 2, figsize=(15, 30))
fig.suptitle(f'Demonstration of the Central Limit Theorem - Processed Data - (sample_size = {sample_size}; n_samples = {n_samples})' , fontsize=16, y=1.02)
# Create Histrograms
for i,(measure, (title, unit)) in enumerate(measurements.items()):
row = i // 2
col = i % 2
# Calculate Sample means
sample_means = np.array([np.mean(np.random.choice(df_cleaned[measure], size=sample_size))
for _ in range(n_samples)])
# Histogram of sample means
sns.histplot(data=sample_means, kde=True, ax=axes[row, col], label = 'Observed Distribution')
# Add theoretical normal curve
x = np.linspace(min(sample_means), max(sample_means), 100)
data_mean = np.mean(df_cleaned[measure])
data_std = np.std(df_cleaned[measure])
theoretical_std = data_std / np.sqrt(sample_size)
theoretical = stats.norm.pdf(x, data_mean, theoretical_std)
axes[row, col].plot(x, theoretical * len(sample_means) * (max(sample_means) - min(sample_means)) / 50,
'r--', label='Calculated Normal')
axes[row, col].legend()
axes[row, col].set_title(f'Sampling Distribution of {measure}')
axes[row, col].set_xlabel(f'Sample Mean')
axes[row, col].set_ylabel(f'Frequency')
axes[row, col].legend()
#----------------------------------------------
plt.tight_layout()
temp_name = f"{out_dir}{out_name}_plot_{image_nr}.png"
fig.savefig(temp_name, dpi=150, bbox_inches='tight')
image_names.append(temp_name)
image_nr += 1
#----------------------------------------------
save_image(image_names, out_dir, out_name)
if not show_plots:
plt.close("all")
# Visualize the Statistical Theory
analist = []
anylist = preprocess_system_data(False, pathFilter, present, None)
visualize_statistical_theory(True, anylist[1],anylist[2])
# Run analysis for alldatasets
#for file in files:
#anylist = preprocess_system_data(False, pathFilter,file,None)
#visualize_statistical_theory(False, anylist[1],anylist[2])
Function called with a file: ./data_filtered/system-19.csv Law of Large Numbers Summary for load-15m -------------------------------------------------- True Probability: 0.4944 Convergence at different sample sizes: n= 10: P=0.4444 (error: 0.0499) n= 100: P=0.4545 (error: 0.0398) n= 1000: P=0.4885 (error: 0.0059) n=10000: P=0.4945 (error: 0.0002) Law of Large Numbers Summary for memory_used_pct -------------------------------------------------- True Probability: 0.4047 Convergence at different sample sizes: n= 10: P=0.2222 (error: 0.1825) n= 100: P=0.3939 (error: 0.0108) n= 1000: P=0.4124 (error: 0.0077) n=10000: P=0.4061 (error: 0.0014) Law of Large Numbers Summary for cpu-user -------------------------------------------------- True Probability: 0.3731 Convergence at different sample sizes: n= 10: P=0.3333 (error: 0.0398) n= 100: P=0.3434 (error: 0.0297) n= 1000: P=0.4044 (error: 0.0313) n=10000: P=0.3670 (error: 0.0061) Law of Large Numbers Summary for cpu-system -------------------------------------------------- True Probability: 0.3327 Convergence at different sample sizes: n= 10: P=0.6667 (error: 0.3340) n= 100: P=0.3131 (error: 0.0195) n= 1000: P=0.3353 (error: 0.0027) n=10000: P=0.3353 (error: 0.0027) Law of Large Numbers Summary for sys-thermal -------------------------------------------------- True Probability: 0.5631 Convergence at different sample sizes: n= 10: P=0.6667 (error: 0.1035) n= 100: P=0.5354 (error: 0.0278) n= 1000: P=0.5726 (error: 0.0095) n=10000: P=0.5628 (error: 0.0004) Law of Large Numbers Summary for sys-interrupt-rate -------------------------------------------------- True Probability: 0.4201 Convergence at different sample sizes: n= 10: P=0.2222 (error: 0.1979) n= 100: P=0.5152 (error: 0.0950) n= 1000: P=0.4064 (error: 0.0137) n=10000: P=0.4314 (error: 0.0113) Law of Large Numbers Summary for disk-io-time -------------------------------------------------- True Probability: 0.4913 Convergence at different sample sizes: n= 10: P=0.6667 (error: 0.1753) n= 100: P=0.5152 (error: 0.0238) n= 1000: P=0.5275 (error: 0.0362) n=10000: P=0.4900 (error: 0.0013) ./visu_statistical_theory/system-19_plot_0.png ./visu_statistical_theory/system-19_plot_1.png ./visu_statistical_theory/system-19_plot_2.png ./visu_statistical_theory/system-19_plot_3.png ./visu_statistical_theory/system-19_plot_4.png ./visu_statistical_theory/system-19_plot_5.png ./visu_statistical_theory/system-19_allPlots.pdf
2.3 Regression Analysis¶
- Linear/Polynomial model selection
- Model fitting and validation
- Result interpretation and analysis
...
import os
import glob
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from matplotlib.backends.backend_pdf import PdfPages
def fit_polynomial_regression(X, y, degree, scaled=True):
"""Fit polynomial regression with optional scaling."""
if scaled:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
else:
X_scaled = X
poly = PolynomialFeatures(degree=degree)
X_poly = poly.fit_transform(X_scaled)
model = LinearRegression()
model.fit(X_poly, y)
if scaled:
return model, poly, scaler
return model, poly, None
def evaluate_model(model, poly, scaler, X, y):
"""Evaluate model performance."""
if scaler is not None:
X_transformed = scaler.transform(X)
else:
X_transformed = X
X_poly = poly.transform(X_transformed)
y_pred = model.predict(X_poly)
r2 = r2_score(y, y_pred)
rmse = np.sqrt(mean_squared_error(y, y_pred))
return r2, rmse, y_pred
def plot_polynomial_fits(X, y, max_degree=6):
"""Plot polynomial fits of different degrees."""
fig, axes = plt.subplots(2, 3, figsize=(15, 12))
axes = axes.ravel()
for degree in range(1, max_degree + 1):
model, poly, scaler = fit_polynomial_regression(X, y, degree)
X_plot = np.linspace(X.min(), X.max(), 100).reshape(-1, 1)
X_plot_scaled = scaler.transform(X_plot)
X_plot_poly = poly.transform(X_plot_scaled)
y_plot = model.predict(X_plot_poly)
r2, rmse, y_pred = evaluate_model(model, poly, scaler, X, y)
ax = axes[degree - 1]
ax.scatter(X, y, color='blue', alpha=0.5, label='Data', rasterized=True) # Rasterized so that it doesnt save SVG to PDF
ax.plot(X_plot, y_plot, color='red', label=f'Degree {degree}')
ax.set_title(f'Polynomial Degree {degree}\nR² = {r2:.3f}, RMSE = {rmse:.3f}')
ax.legend()
ax.grid(True)
plt.tight_layout()
return fig
def compare_cv_scores(X, y, max_degree=6):
"""Compare cross-validation scores for different polynomial degrees."""
train_scores = []
cv_scores = []
degrees = range(1, max_degree + 1)
for degree in degrees:
# Create polynomial features
poly = PolynomialFeatures(degree=degree)
X_poly = poly.fit_transform(X)
# Fit model and get scores
model = LinearRegression()
train_score = model.fit(X_poly, y).score(X_poly, y)
cv_score = np.mean(cross_val_score(model, X_poly, y, cv=5))
train_scores.append(train_score)
cv_scores.append(cv_score)
# Plot scores
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(degrees, train_scores, 'o-', label='Training Score')
ax.plot(degrees, cv_scores, 'o-', label='CV Score')
ax.set_xlabel('Polynomial Degree')
ax.set_ylabel('R² Score')
ax.set_title('Model Performance vs. Polynomial Degree')
ax.legend()
ax.grid(True)
return fig
def plot_residuals(model, poly, scaler, X, y):
"""Plot residuals analysis."""
# Get predictions
if scaler is not None:
X_scaled = scaler.transform(X)
else:
X_scaled = X
X_poly = poly.transform(X_scaled)
y_pred = model.predict(X_poly)
residuals = y - y_pred
# Create plots
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))
# Residuals vs Predicted
ax1.scatter(y_pred, residuals, alpha=0.5, rasterized=True)
ax1.axhline(y=0, color='r', linestyle='--')
ax1.set_xlabel('Predicted Values')
ax1.set_ylabel('Residuals')
ax1.set_title('Residuals vs Predicted')
ax1.grid(True)
# Q-Q plot had to do it this way, since it saved SVG and it takes long time to open in PDF (refer to StackOverFlow)
(osm, osr), (slope, intercept, r) = stats.probplot(residuals, dist="norm")
ax2.plot(osm, osr, 'o', markersize=4, rasterized=True) # Rasterized Q-Q points
ax2.plot(osm, slope * osm + intercept, color='red', linestyle='--') # Line
ax2.set_title('Q-Q Plot')
ax2.set_xlabel('Theoretical Quantiles')
ax2.set_ylabel('Sample Quantiles')
ax2.grid(True)
return fig
# Path to the folder containing CSV files
data_folder = 'data_processed/'
output_folder = 'regression_visu/'
# Ensure the output folder exists
os.makedirs(output_folder, exist_ok=True)
# Loop through all files in the folder
for file_name in os.listdir(data_folder):
if file_name.endswith('.csv'):
file_path = os.path.join(data_folder, file_name)
print(f"Processing file: {file_name}")
# Load dataset and extract features
data = pd.read_csv(file_path)
X = data['cpu-user'].values.reshape(-1, 1)
y = data['cpu-system']
# Create a PDF to save all plots for this file
output_pdf = os.path.join(output_folder, f"{os.path.splitext(file_name)[0]}_analysis.pdf")
with PdfPages(output_pdf) as pdf:
# Visualize polynomial fits
print("Different Polynomial Degrees done")
plot_polynomial_fits(X, y)
pdf.savefig() # Save current figure to the PDF
plt.close()
# Compare cross-validation scores
print("Cross-validation done")
compare_cv_scores(X, y)
pdf.savefig() # Save current figure to the PDF
plt.close()
# Analyze detailed performance of the quadratic model
print("Analysis of Quadratic Model done")
model, poly, scaler = fit_polynomial_regression(X, y, degree=6)
r2, rmse, _ = evaluate_model(model, poly, scaler, X, y)
# Visualize residuals
print("Residual Analysis done\n")
plot_residuals(model, poly, scaler, X, y)
pdf.savefig() # Save current figure to the PDF
plt.close()
print(f"Analysis complete. Results saved in folder {output_folder}")
Processing file: system-1.csv Different Polynomial Degrees done Cross-validation done Analysis of Quadratic Model done Residual Analysis done Processing file: system-10.csv Different Polynomial Degrees done Cross-validation done Analysis of Quadratic Model done Residual Analysis done Processing file: system-12.csv Different Polynomial Degrees done Cross-validation done Analysis of Quadratic Model done Residual Analysis done Processing file: system-13.csv Different Polynomial Degrees done Cross-validation done Analysis of Quadratic Model done Residual Analysis done Processing file: system-14.csv Different Polynomial Degrees done Cross-validation done Analysis of Quadratic Model done Residual Analysis done Processing file: system-15.csv Different Polynomial Degrees done Cross-validation done Analysis of Quadratic Model done Residual Analysis done Processing file: system-16.csv Different Polynomial Degrees done Cross-validation done Analysis of Quadratic Model done Residual Analysis done Processing file: system-18.csv Different Polynomial Degrees done Cross-validation done Analysis of Quadratic Model done Residual Analysis done Processing file: system-19.csv Different Polynomial Degrees done Cross-validation done Analysis of Quadratic Model done Residual Analysis done Processing file: system-2.csv Different Polynomial Degrees done Cross-validation done Analysis of Quadratic Model done Residual Analysis done Processing file: system-3.csv Different Polynomial Degrees done Cross-validation done Analysis of Quadratic Model done Residual Analysis done Processing file: system-4.csv Different Polynomial Degrees done Cross-validation done Analysis of Quadratic Model done Residual Analysis done Processing file: system-5.csv Different Polynomial Degrees done Cross-validation done Analysis of Quadratic Model done Residual Analysis done Processing file: system-6.csv Different Polynomial Degrees done Cross-validation done Analysis of Quadratic Model done Residual Analysis done Processing file: system-7.csv Different Polynomial Degrees done Cross-validation done Analysis of Quadratic Model done Residual Analysis done Processing file: system-9.csv Different Polynomial Degrees done Cross-validation done Analysis of Quadratic Model done Residual Analysis done Analysis complete. Results saved in folder regression_visu/